#File to estimate smoothed polling averages over time and create estimates for election outcomes

library(countrycode)
library(lubridate)
library(ggplot2)
library(reshape)
library(caret)
library(lme4)

dat <- read.csv("global_polling_replication_data.csv")
elections.training <- read.csv("elections_training_wround09082016.csv")

# Step: create essential features
dat$wts <- scale(1/(dat$days.to.elec+2), center=F) #+ scale(1/((abs(dat$pro.incumbent)+1) ), center=F) +scale(dat$n.elecs, center=F) #should do this by election
dat$pro.incumbent <- scale(dat$pro.incumbent, center=F)
dat$sm.pollmarg <- NA
dat$Round.Date <- as.Date(as.character(dat$Round.Date), format="%m/%d/%Y")

###########
#Smoothing the polls based on similar elections, weighting by the recency of the poll
############
trainingsize <- 50
bad <- 1
# Step: creating the smoothed estimate for the training set
# Step: create loop creating smoothed estimate for each election in the testing data
get_preds <- function(pdat=dat, trainingset=elections.training, trainingsize= 50 , bad = 1 , leadTime = 0){
  pdat$wts <- scale(1/(pdat$days.to.elec-leadTime+2), center=F) #
  
  # Remove missing polls and dates
  pdat <- pdat[!is.na(pdat$pollmarg), ]
  pdat <- pdat[!is.na(pdat$Poll.Date), ]
  
  for(i in 1:sum(elections.training$trainingset==0)){
    
    elecs <- pdat$electionid %in% elections.training$electionid[ 1:(trainingsize+i)] #get T/F in or out of training set
    
    #########
    sm.mod <- try(lmer(pollmarg ~ pro.incumbent + (pro.incumbent|incRun) + (1|asp.gdp.bin) +(1|electionid)+(1|ccode)+(1|Pollster)+(1|region), na.action=na.exclude, weights=wts, data=pdat[ elecs, ], control=lmerControl(optimizer="bobyqa", boundary.tol = 1e-2, check.scaleX="silent.rescale")) )
    #error handling
    w <- warnings(); assign("last.warning", NULL, envir = baseenv()) #assign warning to w, then delete all warnings
    
    #######
    if(is.null(w)==F){ #if there is a warning, then run this model - no CCODE variable
      print(paste("warning", i, "did not converge"))
      w <- NULL
      assign("last.warning", NULL, envir = baseenv())
    }
    #######
    if(i==1){ #if it is the first round of the loop, predict the training set plus the first test election
      pdat$sm.pollmarg[ elecs] <- predict(sm.mod, newdata=pdat[ elecs,])    
    }
    #######
    next.elec <- pdat$electionid %in% elections.training$electionid[ trainingsize + i ] #get just the election to make smoothed pred.

    if(i>1 & nrow(pdat[next.elec, ])>0){ #if it is not the first round of the loop, predict just the next test election
      pdat$sm.pollmarg[ next.elec ] <- predict(sm.mod, newdata=pdat[next.elec, ]) #produces a prediction
      print(elections.training$electionid[ trainingsize + i ])
    }
    print(paste(i, "successfully completed"))
  }
  return(pdat)
}

#pdat1 <- get_preds(pdat=dat, trainingset=elections.training, trainingsize=50, bad=1, leadTime = 0)

#Generate plot of estimated values from iterated smoothing
sm.agg <- aggregate(cbind(sm.pollmarg, pollmarg, Round.Date, realmarg, year, region=as.numeric(as.factor(region)),  multiparty, l1polity2, wb.inflation, asp.gdp, Round=as.numeric(as.factor(Round)))~electionid, median, data=pdat1)
sum(sm.agg$sm.pollmarg*sm.agg$realmarg >= 0, na.rm=T) / sum(!is.na(sm.agg$sm.pollmarg))
qplot(sm.pollmarg, realmarg, data=sm.agg, colour=year) + geom_smooth(method="loess") + ylab("Real Margin") + xlab("Smoothed Margin (post-averaging)")

#################
# MODEL of outcome using smoothed polling data
################
library(caret); set.seed(13243)

run.finalsynthetic <- function(data=sm.agg){
  data <- data[order(data$Round.Date), ]
  trcontrol <- trainControl(method="timeslice", initialWindow = 50, horizon=1, fixedWindow=F, savePredictions = T)# , indexOut = indexOut
  #linear partial least squares
  trmode <- train(realmarg~sm.pollmarg+l1polity2+wb.inflation, data=data, trControl=trcontrol, metric="RMSE", method="pls", tuneGrid=expand.grid(ncomp=2)) 
  return(trmode)
}
# This is just the standard model, for reference, not a figure in the paper. 
trmode <- run.finalsynthetic()
sum( (trmode$pred$pred* trmode$pred$obs) > 0) / length(trmode$pred$pred)
trmode$pred$result <- ifelse((trmode$pred$pred* trmode$pred$obs) > 0, "correct", "incorrect")
qplot(pred, obs, data=trmode$pred, colour=result) + geom_smooth(method="loess") + ylab("Observed Margin of Incumbent") + xlab("Model Prediction")

########   Figure S14 **********
################################
# LIMIT THE POLLING DATA TO SPECIFIC TIME WINDOWS: greater than one week, greater than one month, greater than two months
########
# NB, need to re-transform the weights, because they are probably not scaled properly **OK, FIXED

pdat.week <- get_preds(pdat=dat[which(dat$days.to.elec>=7), ], leadTime=7)
sm.agg <- aggregate(cbind(sm.pollmarg, pollmarg, Round.Date, realmarg, year, region=as.numeric(as.factor(region)), multiparty, polity2, l1polity2, wb.inflation, asp.gdp, Round=as.numeric(as.factor(Round)))~electionid, median, data=pdat.week)
weekmod <- run.finalsynthetic()
mod <- weekmod
postResample(mod$pred$pred, mod$pred$obs)
mod$pred$result <- ifelse((mod$pred$pred* mod$pred$obs) > 0, "correct", "incorrect")
p1 = qplot(pred, obs, data=mod$pred, colour=result) + geom_smooth(method="loess") + ylab("Observed Margin of Incumbent") + xlab("Model Prediction with Second Stage OLS") + theme(legend.position=c(.1, .9), legend.title=element_blank())
sum( (weekmod$pred$pred* weekmod$pred$obs) > 0) / length(weekmod$pred$pred)


pdat.twoweek <- get_preds(pdat=dat[which(dat$days.to.elec>=14), ], leadTime=14)
sm.agg <- aggregate(cbind(sm.pollmarg, Round.Date, pollmarg, realmarg, year, region=as.numeric(as.factor(region)), incRun, incApp, incExtend,  multiparty, polity2, l1polity2, wb.inflation, asp.gdp, Round=as.numeric(as.factor(Round)))~electionid, median, data=pdat.twoweek)
twoweekmod <- run.finalsynthetic()
mod <- twoweekmod
postResample(mod$pred$pred, mod$pred$obs)
mod$pred$result <- ifelse((mod$pred$pred* mod$pred$obs) > 0, "correct", "incorrect")
p2 = qplot(pred, obs, data=mod$pred, colour=result) + geom_smooth(method="loess") + ylab("Observed Margin of Incumbent") + xlab("Model Prediction with Second Stage OLS") + theme(legend.position=c(.1, .9), legend.title=element_blank())
sum( (twoweekmod$pred$pred* twoweekmod$pred$obs) > 0) / length(twoweekmod$pred$pred)


pdat.month <- get_preds(pdat=dat[which(dat$days.to.elec>=30), ])
sm.agg <- aggregate(cbind(sm.pollmarg, Round.Date, pollmarg, realmarg, year, region=as.numeric(as.factor(region)), incRun, incApp, incExtend,  multiparty, polity2, l1polity2, wb.inflation, asp.gdp, Round=as.numeric(as.factor(Round)))~electionid, median, data=pdat.month)
monthmod <- run.finalsynthetic()
mod <- monthmod
postResample(mod$pred$pred, mod$pred$obs)
mod$pred$result <- ifelse((mod$pred$pred* mod$pred$obs) > 0, "correct", "incorrect")
p3 = qplot(pred, obs, data=mod$pred, colour=result) + geom_smooth(method="loess") + ylab("Observed Margin of Incumbent") + xlab("Model Prediction with Second Stage OLS") + theme(legend.position=c(.1, .9), legend.title=element_blank())
sum( (monthmod$pred$pred* monthmod$pred$obs) > 0) / length(monthmod$pred$pred)


pdat.2months <- get_preds(pdat=dat[which(dat$days.to.elec>=60), ])
sm.agg <- aggregate(cbind(sm.pollmarg, Round.Date, pollmarg, realmarg, year, region=as.numeric(as.factor(region)), incRun, incApp, incExtend,  multiparty, polity2, l1polity2, wb.inflation, asp.gdp, Round=as.numeric(as.factor(Round)))~electionid, median, data=pdat.2months)
twomonthmod <- run.finalsynthetic()
mod <- twomonthmod
postResample(mod$pred$pred, mod$pred$obs)
mod$pred$result <- ifelse((mod$pred$pred* mod$pred$obs) > 0, "correct", "incorrect")
p4 = qplot(pred, obs, data=mod$pred, colour=result) + geom_smooth(method="loess", span=2) + ylab("Observed Margin of Incumbent") + xlab("Model Prediction with Second Stage OLS") + theme(legend.position=c(.1, .9), legend.title=element_blank())
sum( (twomonthmod$pred$pred* twomonthmod$pred$obs) > 0) / length(twomonthmod$pred$pred)


################# END **********************************************
